library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
westwood <- read.csv("westwood.csv")
head(westwood) # make sure data is loaded
## name host_since
## 1 Private Master+Bath, Gym+Amenities; UCLA/Rotations 2013-03-20
## 2 Stylish 1BD Westwood Apartment 2011-11-09
## 3 Private Master bedroom by UCLA and century city- 2013-08-06
## 4 Small 1 Bedroom Loft Near UCLA on Westwood Blvd 2012-08-27
## 5 Vintage cozy room in West LA 2014-01-07
## 6 Spacious and Private 2Bed/2 Bathroom in Westwood 2014-01-08
## host_response_time host_response_rate host_acceptance_rate host_is_superhost
## 1 within a day 67% 50% f
## 2 N/A N/A 86% f
## 3 within a few hours 100% 100% f
## 4 within an hour 99% 77% f
## 5 within an hour 100% 100% f
## 6 within a few hours 100% 57% f
## host_neighbourhood host_total_listings_count
## 1 Westwood 1
## 2 Westwood 1
## 3 1
## 4 Westside 40
## 5 Westwood 1
## 6 Westwood 1
## host_verifications
## 1 ['email', 'phone', 'reviews', 'kba', 'work_email']
## 2 ['email', 'phone', 'facebook', 'reviews', 'jumio', 'offline_government_id', 'selfie', 'government_id', 'identity_manual']
## 3 ['email', 'phone', 'google', 'reviews', 'kba', 'work_email']
## 4 ['email', 'phone', 'reviews', 'offline_government_id', 'sent_id', 'kba', 'selfie', 'government_id']
## 5 ['email', 'phone', 'facebook', 'reviews', 'jumio', 'government_id']
## 6 ['email', 'phone', 'reviews', 'kba']
## host_has_profile_pic host_identity_verified property_type room_type
## 1 t t Condominium Private room
## 2 t t Apartment Entire home/apt
## 3 t t Condominium Private room
## 4 t t Apartment Entire home/apt
## 5 t t Apartment Private room
## 6 t t Apartment Entire home/apt
## accommodates bathrooms bedrooms beds bed_type number_of_reviews
## 1 1 1 1 1 Real Bed 49
## 2 2 1 1 1 Real Bed 29
## 3 2 1 1 1 Real Bed 11
## 4 2 1 1 1 Real Bed 32
## 5 1 1 1 1 Real Bed 295
## 6 4 2 2 2 Real Bed 74
## review_scores_rating instant_bookable cancellation_policy
## 1 99 f strict_14_with_grace_period
## 2 97 f strict_14_with_grace_period
## 3 93 t moderate
## 4 79 f strict_14_with_grace_period
## 5 93 t strict_14_with_grace_period
## 6 98 f strict_14_with_grace_period
## reviews_per_month price
## 1 0.59 $70.00
## 2 0.37 $115.00
## 3 0.71 $75.00
## 4 0.52 $72.00
## 5 3.90 $75.00
## 6 0.97 $125.00
num_NA <- integer(0) # the number of empty values in each column
for (c in 1:ncol(westwood)){
westwood[which(westwood[,c]=="N/A"|westwood[,c]==""),c] <- NA
if (!is.null(levels(westwood[,c]))){
levels(westwood[,c])[which(levels(westwood[,c])=="N/A"|levels(westwood[,c])=="")] <- NA
}
num_NA[c] <- sum(is.na(westwood[,c]))
}
westwood[,4] <- as.numeric(str_extract(westwood[,4],"\\d+"))/100 # percent
westwood[,5] <- as.numeric(str_extract(westwood[,5],"\\d+"))/100 # percent
westwood[,6] <- ifelse(westwood[,6]=="f",0,1) # logical vector: 0 for FALSE, 1 for TRUE
# count for number of verifications
temp <- integer(530)
for (i in 1:length(westwood[,9])){
temp[i] <- length(str_extract_all(westwood[,9][i],",")[[1]])+1
}
westwood[,9] <- temp
westwood[,10] <- ifelse(westwood[,10]=="f",0,1) # logical vector: 0 for FALSE, 1 for TRUE
westwood[,11] <- ifelse(westwood[,11]=="f",0,1) # logical vector: 0 for FALSE, 1 for TRUE
westwood[,21] <- ifelse(westwood[,21]=="f",0,1) # logical vector: 0 for FALSE, 1 for TRUE
temp <- substr(westwood[,24],2,nchar(as.character(westwood[,24])))
westwood[,24] <- as.numeric(str_replace_all(temp, ",", "")) # clean price
# For host_since, convert into days till the last time this dataset being measured (08 May, 2020)
westwood[,2] <- as.numeric(difftime(as.Date("2020-05-08"),as.Date(westwood[,2])))
colnames(westwood)[2] <- "host_has_been"
westwood <- westwood[,-7] # remove host_neighbourhood
westwood <- westwood[,-c(1,3,4,5,11)]
westwood$room_type<-factor(westwood$room_type)
westwood$bed_type<-factor(westwood$bed_type)
westwood$cancellation_policy<-factor(westwood$cancellation_policy)
westwood$host_is_superhost<-factor(westwood$host_is_superhost)
westwood$host_has_profile_pic<-factor(westwood$host_has_profile_pic)
westwood$host_identity_verified<-factor(westwood$host_identity_verified)
westwood$instant_bookable<-factor(westwood$instant_bookable)
head(westwood) # make sure data is cleaned
## host_has_been host_is_superhost host_total_listings_count host_verifications
## 1 2606 0 1 5
## 2 3103 0 1 9
## 3 2467 0 1 6
## 4 2811 0 40 8
## 5 2313 0 1 6
## 6 2312 0 1 4
## host_has_profile_pic host_identity_verified room_type accommodates
## 1 1 1 Private room 1
## 2 1 1 Entire home/apt 2
## 3 1 1 Private room 2
## 4 1 1 Entire home/apt 2
## 5 1 1 Private room 1
## 6 1 1 Entire home/apt 4
## bathrooms bedrooms beds bed_type number_of_reviews review_scores_rating
## 1 1 1 1 Real Bed 49 99
## 2 1 1 1 Real Bed 29 97
## 3 1 1 1 Real Bed 11 93
## 4 1 1 1 Real Bed 32 79
## 5 1 1 1 Real Bed 295 93
## 6 2 2 2 Real Bed 74 98
## instant_bookable cancellation_policy reviews_per_month price
## 1 0 strict_14_with_grace_period 0.59 70
## 2 0 strict_14_with_grace_period 0.37 115
## 3 1 moderate 0.71 75
## 4 0 strict_14_with_grace_period 0.52 72
## 5 1 strict_14_with_grace_period 3.90 75
## 6 0 strict_14_with_grace_period 0.97 125
Heads up: 1. these variables have more than 20% NA values: “host_response_time” “host_response_rate” “review_scores_rating” “reviews_per_month” 2. these varaibles have relatively high (10%-20%) NA values: “host_acceptance_rate” 3. Variable 12) there are 13 levels (simplify? Might be too long and overwhelming in the model).
full_model<-lm(price~.,data=westwood)
plot(full_model)
## Warning: not plotting observations with leverage one:
## 16, 32
plot(price~.,data=westwood)
#mmps(full_model)
Look at diagnostic plots + Marginal Model Plot for each transformation 1) Transform Y and keep the predictors the same (Inverse Reg + Box Cox) 2) Keep Y and transform the predictors (Box Cox) 3) Transform both Y and the X’s (Box Cox)
#the following code is a template, need to plug in actual values/variables
#Step 1: Transform Y
#inverse regression
library(alr3)
## Loading required package: car
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
invResPlot(full_model)
## lambda RSS
## 1 -0.002650926 4938471
## 2 -1.000000000 6611056
## 3 0.000000000 4938502
## 4 1.000000000 9296745
The inverse response plot approach suggests that the best transformation for the response variable is log(Y).
#box cox
summary(powerTransform(full_model))
## bcPower Transformation to Normality
## Est Power Rounded Pwr Wald Lwr Bnd Wald Upr Bnd
## Y1 -0.3365 -0.33 -0.4137 -0.2593
##
## Likelihood ratio test that transformation parameter is equal to 0
## (log transformation)
## LRT df pval
## LR test, lambda = (0) 87.78134 1 < 2.22e-16
##
## Likelihood ratio test that no transformation is needed
## LRT df pval
## LR test, lambda = (1) 1805.706 1 < 2.22e-16
The box-cox approach suggests that the best transformation of the response variable is Y^-0.33.
#log(Y)
library(dplyr)
westwood_t1<-mutate(westwood, logprice=log(price))
westwood_t1<-westwood_t1[,-18]
plot(logprice~.,data=westwood_t1)
log_model<-lm(logprice~.,data=westwood_t1)
mmps(log_model)
## Warning in mmps(log_model): Interactions and/or factors skipped
par(mfrow=c(2,2))
plot(log_model)
## Warning: not plotting observations with leverage one:
## 16, 32
#Y^-(1/3)
westwood_t2<-mutate(westwood, tprice=price^-(1/3))
westwood_t2<-westwood_t2[,-18]
plot(tprice~.,data=westwood_t2)
par(mfrow=c(2,2))
y2_model<-lm(tprice~.,data=westwood_t2)
plot(y2_model)
## Warning: not plotting observations with leverage one:
## 16, 32
mmps(y2_model)
## Warning in mmps(y2_model): Interactions and/or factors skipped